# -*- coding: utf-8 -*-
from datetime import datetime, timedelta, time
from hexunblog.items import HexunblogItem
import re
'''
Service class for blog-article (博文) info objects.
'''


class ArticleInfoService:
    """Helper service for building and normalizing blog-article items.

    All methods are classmethods; the class itself carries no state.
    """

    def __init__(self):
        pass

    @classmethod
    def get_date_with_frdays(cls, fr_days):
        """Return the datetime that was ``fr_days`` days before now."""
        return datetime.now() - timedelta(days=int(fr_days))

    @classmethod
    def get_frhours_from_now(cls, hours):
        """Return the datetime that was ``hours`` hours before now.

        Bug fix: the original subtracted an int from a ``datetime.time``
        (TypeError) and then called the non-existent ``time.localtime``
        (``time`` here is ``datetime.time``, not the ``time`` module), so
        it always raised and every 'N小时前' date silently became None.
        """
        return datetime.now() - timedelta(hours=int(hours))

    @classmethod
    def get_frmins_from_now(cls, mins):
        """Return the datetime that was ``mins`` minutes before now.

        Same fix as :meth:`get_frhours_from_now`.
        """
        return datetime.now() - timedelta(minutes=int(mins))

    @classmethod
    def tran_string_to_date(cls, date_string):
        """Parse a publish-date string into a datetime.

        The site emits several formats: ``'yyyy/mm/dd'``, ``'N天前'``
        (N days ago), ``'N小时前'`` (N hours ago) or ``'N分钟前'``
        (N minutes ago).

        Returns None when no format matches.
        """
        # Absolute format first.
        try:
            return datetime.strptime(date_string, '%Y/%m/%d')
        except (ValueError, TypeError):
            pass

        # Relative formats: stripping a suffix only yields a valid int for
        # the matching pattern, so trying each in turn is safe.
        relative_formats = (
            ("天前", cls.get_date_with_frdays),
            ("小时前", cls.get_frhours_from_now),
            ("分钟前", cls.get_frmins_from_now),
        )
        for suffix, builder in relative_formats:
            try:
                return builder(int(date_string.replace(suffix, "")))
            except (ValueError, AttributeError):
                pass

        return None

    @classmethod
    def tran_tag_in_content(cls, content):
        """Escape HTML-sensitive characters in ``content``.

        Prevents raw ``"``, ``<`` and ``>`` from breaking MySQL storage.
        Note: ``&`` is deliberately left unescaped, matching the original
        behavior.
        """
        return content.replace('"', "&quot;").replace('<', "&lt;").replace('>', "&gt;")

    @classmethod
    def get_host_from_url(cls, url):
        """Extract the host part of ``url`` (for an HTTP ``Host`` header).

        Returns None only if splitting somehow yields nothing; for an
        empty string this returns '' (unchanged original behavior).
        """
        parts = url.replace("http://", "").replace("https://", "").split('/')
        return parts[0] if parts else None

    @classmethod
    def compose_article_count(cls, article_id, click_count, comment_count):
        """Build a partial HexunblogItem carrying only identity and counts."""
        item = HexunblogItem()
        item["title"] = None
        item["article_id"] = article_id
        item["click_count"] = click_count
        item["comment_count"] = comment_count
        return item

    @classmethod
    def compose_article_item(cls, response, url):
        """Build a complete HexunblogItem from an article page and its URL.

        Extracts title, publish date, original/repost flag, tag and
        category from the HTML, plus user/article/blog IDs from an inline
        JS call.  Click and comment counts are loaded asynchronously by
        the site, so they are initialized to 0 here.
        """
        # Title: <span class="ArticleTitleText"><a>...</a>
        title_list = response.xpath(".//span[@class='ArticleTitleText']/a/text()").extract()

        # Publish date + original/repost flag: text of <div class="ArticleTitle">
        pd_list = response.xpath(".//div[@class='ArticleTitle']/text()").extract()

        # Tag: first <a> inside <div class="ArticleTag">
        tag_list = response.xpath(".//div[@class='ArticleTag']/a/text()").extract()

        # Category/board: first <a> inside <div class="ArticleClass">
        class_list = response.xpath(".//div[@class='ArticleClass']/a/text()").extract()

        item = HexunblogItem()

        # user_id / article_id / blog_id come from an inline call such as:
        #   showArticleComments("25914240", "113881877", "18594907", 1);
        # NOTE(review): body_as_unicode() is deprecated in newer Scrapy
        # (use response.text) -- kept as-is for this project's version.
        html_content = response.body_as_unicode()
        pattern = re.compile(r'.*?showArticleComments.*?"(.*?)", "(.*?)", "(.*?)".*?')
        rst = re.findall(pattern, html_content)
        if rst and len(rst) == 1 and len(rst[0]) == 3:
            item["user_id"], item["article_id"], item["blog_id"] = rst[0]

        # URL, e.g. http://14755920.blog.hexun.com/113881397_d.html
        if url:
            item["url"] = url

        # Publish date and original/repost flag.
        if pd_list:
            pd_str = "".join(pd_list).replace("\r", "").replace("\n", "").replace("·", "")
            pd_str = pd_str.replace("[", "").replace("]", "")

            # Bug fix: str.find returns 0 when the marker sits at position
            # 0; the original `> 0` test misclassified that case as repost.
            item["original"] = 1 if pd_str.find("原创") != -1 else 0

            pd_str = pd_str.replace("原创", "").replace("转贴", "")
            item["publish_date"] = pd_str.strip()

        # Title, with a placeholder when missing.
        if title_list:
            item["title"] = title_list[0].replace("\r", "").replace("\n", "").replace(" ", "").replace("·", "")
        else:
            item["title"] = "无标题"

        # Tag and category fall back to empty strings.
        item["tag_name"] = tag_list[0].replace(" ", "") if tag_list else ""
        item["class_name"] = class_list[0] if class_list else ""

        # Author and counts are filled in later via async endpoints.
        item["author"] = "佚名作者"
        item["click_count"] = 0
        item["comment_count"] = 0
        return item
